{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-processing of the outputevents dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "file_path=\"~/Documents/Data/Full_MIMIC/\"\n",
    "\n",
    "pd.set_option('display.max_rows', 50)\n",
    "pd.set_option('display.max_columns', 300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "adm=pd.read_csv(file_path+\"Admissions_processed.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now consider the outputevents dataset. We select only the patients with the same criteria as above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs=pd.read_csv(file_path+\"OUTPUTEVENTS.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "23330\n"
     ]
    }
   ],
   "source": [
    "#Some checks\n",
    "assert(len(outputs.loc[outputs[\"ISERROR\"].notnull()].index)==0) #No entry with iserror==TRUE\n",
    "\n",
    "#Restrict the dataset to the previously selected admission ids only.\n",
    "adm_ids=list(adm[\"HADM_ID\"])\n",
    "outputs=outputs.loc[outputs[\"HADM_ID\"].isin(adm_ids)]\n",
    "\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs[\"SUBJECT_ID\"].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We load the D_ITEMS dataframe which contains the name of the ITEMID. And we merge both tables together."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "23330\n"
     ]
    }
   ],
   "source": [
    "#item_id \n",
    "item_id=pd.read_csv(file_path+\"D_ITEMS.csv\")\n",
    "item_id_1=item_id[[\"ITEMID\",\"LABEL\"]]\n",
    "item_id_1.head()\n",
    "\n",
    "#We merge the name of the item administrated.\n",
    "outputs_2=pd.merge(outputs,item_id_1,on=\"ITEMID\")\n",
    "outputs_2.head()\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs_2[\"SUBJECT_ID\"].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We compute the number of patients that have the specific outputs labels and we select only the features that are the most present over the whole data set. For this, we rank the features by number of patients and select the n_best."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "23201\n",
      "Number of datapoints remaining in the database: \n",
      "1758750\n",
      "LABEL\n",
      "Urine Out Foley                              12073\n",
      "Foley                                         9150\n",
      "Stool Out Stool                               4800\n",
      "Chest Tubes CTICU CT 1                        3483\n",
      "Gastric Oral Gastric                          3304\n",
      "OR Urine                                      3291\n",
      "Urine Out Void                                3228\n",
      "Void                                          3220\n",
      "OR Out PACU Urine                             2904\n",
      "Chest Tube #1                                 2527\n",
      "Pre-Admission Output Pre-Admission Output     2069\n",
      "Pre-Admission                                 1854\n",
      "Oral Gastric                                  1781\n",
      "OR Out OR Urine                               1723\n",
      "OR Out EBL                                    1720\n",
      "Name: SUBJECT_ID, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "n_best=15\n",
    "#For each item, evaluate the number of patients who have been given this item.\n",
    "pat_for_item=outputs_2.groupby(\"LABEL\")[\"SUBJECT_ID\"].nunique()\n",
    "#Order by occurence and take the 20 best (the ones with the most patients)\n",
    "frequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]\n",
    "\n",
    "#Select only the time series with high occurence.\n",
    "outputs_3=outputs_2.loc[outputs_2[\"LABEL\"].isin(list(frequent_labels.index))].copy()\n",
    "\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs_3[\"SUBJECT_ID\"].nunique())\n",
    "print(\"Number of datapoints remaining in the database: \")\n",
    "print(len(outputs_3.index))\n",
    "\n",
    "print(frequent_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Eventually, we select the same labels of the paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "15087\n",
      "Number of datapoints remaining in the database: \n",
      "750817\n"
     ]
    }
   ],
   "source": [
    "outputs_label_list=['Gastric Gastric Tube','Stool Out Stool','Urine Out Incontinent','Ultrafiltrate Ultrafiltrate','Foley', 'Void','Condom Cath','Fecal Bag','Ostomy (output)','Chest Tube #1','Chest Tube #2','Jackson Pratt #1','OR EBL','Pre-Admission','TF Residual']\n",
    "outputs_bis=outputs_2.loc[outputs_2[\"LABEL\"].isin(outputs_label_list)].copy()\n",
    "\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs_bis[\"SUBJECT_ID\"].nunique())\n",
    "print(\"Number of datapoints remaining in the database: \")\n",
    "print(len(outputs_bis.index))\n",
    "\n",
    "outputs_3=outputs_bis.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cleaning of the output data\n",
    "\n",
    "### Units Cleaning\n",
    "\n",
    "#### 1) Amounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LABEL                        VALUEUOM\n",
       "Chest Tube #1                mL           58749\n",
       "Chest Tube #2                mL            7534\n",
       "Condom Cath                  mL            2392\n",
       "Fecal Bag                    mL            1263\n",
       "Foley                        mL          577219\n",
       "Gastric Gastric Tube         ml            1616\n",
       "Jackson Pratt #1             mL            8168\n",
       "OR EBL                       mL            1940\n",
       "Ostomy (output)              mL            1562\n",
       "Pre-Admission                mL            2530\n",
       "Stool Out Stool              ml           29354\n",
       "TF Residual                  mL           11871\n",
       "Ultrafiltrate Ultrafiltrate  ml           18463\n",
       "Urine Out Incontinent        ml             861\n",
       "Void                         mL           26802\n",
       "Name: VALUEUOM, dtype: int64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Verification that all input labels have the same amounts units.\n",
    "outputs_3.groupby(\"LABEL\")[\"VALUEUOM\"].value_counts() #OK"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check for outliers\n",
    "\n",
    "#### 1) In amounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LABEL</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Chest Tube #1</th>\n",
       "      <td>58749.0</td>\n",
       "      <td>37.339165</td>\n",
       "      <td>59.338525</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>2500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chest Tube #2</th>\n",
       "      <td>7534.0</td>\n",
       "      <td>31.527741</td>\n",
       "      <td>63.450023</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>1900.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Condom Cath</th>\n",
       "      <td>2392.0</td>\n",
       "      <td>244.961538</td>\n",
       "      <td>223.463992</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>200.0</td>\n",
       "      <td>340.0</td>\n",
       "      <td>2350.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fecal Bag</th>\n",
       "      <td>1263.0</td>\n",
       "      <td>447.680918</td>\n",
       "      <td>341.054355</td>\n",
       "      <td>0.0</td>\n",
       "      <td>200.0</td>\n",
       "      <td>375.0</td>\n",
       "      <td>700.0</td>\n",
       "      <td>2500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Foley</th>\n",
       "      <td>577219.0</td>\n",
       "      <td>127.286087</td>\n",
       "      <td>6026.738198</td>\n",
       "      <td>0.0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>4555555.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gastric Gastric Tube</th>\n",
       "      <td>1676.0</td>\n",
       "      <td>118.635442</td>\n",
       "      <td>156.237053</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>1600.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Jackson Pratt #1</th>\n",
       "      <td>8168.0</td>\n",
       "      <td>72.272857</td>\n",
       "      <td>100.280253</td>\n",
       "      <td>0.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>1325.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OR EBL</th>\n",
       "      <td>1940.0</td>\n",
       "      <td>694.788660</td>\n",
       "      <td>1499.586855</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>250.0</td>\n",
       "      <td>600.0</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ostomy (output)</th>\n",
       "      <td>1562.0</td>\n",
       "      <td>204.490397</td>\n",
       "      <td>169.469766</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>2000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pre-Admission</th>\n",
       "      <td>2530.0</td>\n",
       "      <td>632.887747</td>\n",
       "      <td>782.853310</td>\n",
       "      <td>-689.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>400.0</td>\n",
       "      <td>820.0</td>\n",
       "      <td>11000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stool Out Stool</th>\n",
       "      <td>7564.0</td>\n",
       "      <td>176.888154</td>\n",
       "      <td>237.387719</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>250.0</td>\n",
       "      <td>3500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TF Residual</th>\n",
       "      <td>11871.0</td>\n",
       "      <td>32.025187</td>\n",
       "      <td>60.491895</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>880.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ultrafiltrate Ultrafiltrate</th>\n",
       "      <td>18489.0</td>\n",
       "      <td>204.264855</td>\n",
       "      <td>330.029101</td>\n",
       "      <td>0.0</td>\n",
       "      <td>79.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>239.0</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Urine Out Incontinent</th>\n",
       "      <td>162.0</td>\n",
       "      <td>63.993827</td>\n",
       "      <td>109.968516</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>600.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Void</th>\n",
       "      <td>26802.0</td>\n",
       "      <td>303.662413</td>\n",
       "      <td>197.938023</td>\n",
       "      <td>-1500.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>260.0</td>\n",
       "      <td>400.0</td>\n",
       "      <td>2000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                count        mean          std     min    25%  \\\n",
       "LABEL                                                                           \n",
       "Chest Tube #1                 58749.0   37.339165    59.338525     0.0   10.0   \n",
       "Chest Tube #2                  7534.0   31.527741    63.450023     0.0    0.0   \n",
       "Condom Cath                    2392.0  244.961538   223.463992     0.0  100.0   \n",
       "Fecal Bag                      1263.0  447.680918   341.054355     0.0  200.0   \n",
       "Foley                        577219.0  127.286087  6026.738198     0.0   45.0   \n",
       "Gastric Gastric Tube           1676.0  118.635442   156.237053     0.0   15.0   \n",
       "Jackson Pratt #1               8168.0   72.272857   100.280253     0.0   20.0   \n",
       "OR EBL                         1940.0  694.788660  1499.586855     0.0  100.0   \n",
       "Ostomy (output)                1562.0  204.490397   169.469766     0.0  100.0   \n",
       "Pre-Admission                  2530.0  632.887747   782.853310  -689.0  150.0   \n",
       "Stool Out Stool                7564.0  176.888154   237.387719     0.0    0.0   \n",
       "TF Residual                   11871.0   32.025187    60.491895     0.0    0.0   \n",
       "Ultrafiltrate Ultrafiltrate   18489.0  204.264855   330.029101     0.0   79.0   \n",
       "Urine Out Incontinent           162.0   63.993827   109.968516     0.0    0.0   \n",
       "Void                          26802.0  303.662413   197.938023 -1500.0  175.0   \n",
       "\n",
       "                               50%    75%        max  \n",
       "LABEL                                                 \n",
       "Chest Tube #1                 30.0   50.0     2500.0  \n",
       "Chest Tube #2                 20.0   40.0     1900.0  \n",
       "Condom Cath                  200.0  340.0     2350.0  \n",
       "Fecal Bag                    375.0  700.0     2500.0  \n",
       "Foley                         80.0  150.0  4555555.0  \n",
       "Gastric Gastric Tube          60.0  175.0     1600.0  \n",
       "Jackson Pratt #1              40.0   80.0     1325.0  \n",
       "OR EBL                       250.0  600.0    20000.0  \n",
       "Ostomy (output)              175.0  300.0     2000.0  \n",
       "Pre-Admission                400.0  820.0    11000.0  \n",
       "Stool Out Stool              100.0  250.0     3500.0  \n",
       "TF Residual                   10.0   30.0      880.0  \n",
       "Ultrafiltrate Ultrafiltrate  149.0  239.0     5000.0  \n",
       "Urine Out Incontinent          0.0  100.0      600.0  \n",
       "Void                         260.0  400.0     2000.0  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs_3.groupby(\"LABEL\")[\"VALUE\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "15057\n",
      "Number of datapoints remaining in the database: \n",
      "749738\n"
     ]
    }
   ],
   "source": [
    "#Remove all entries whose rate is more than 4 std away from the mean.\n",
    "out_desc=outputs_3.groupby(\"LABEL\")[\"VALUE\"].describe()\n",
    "name_list=list(out_desc.loc[out_desc[\"count\"]!=0].index)\n",
    "for label in name_list:\n",
    "    outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==label)&(outputs_3[\"VALUE\"]>(out_desc.loc[label,\"mean\"]+4*out_desc.loc[label,\"std\"]))].index).copy()\n",
    "\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs_3[\"SUBJECT_ID\"].nunique())\n",
    "print(\"Number of datapoints remaining in the database: \")\n",
    "print(len(outputs_3.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of patients remaining in the database: \n",
      "12303\n",
      "Number of datapoints remaining in the database: \n",
      "726834\n"
     ]
    }
   ],
   "source": [
    "#Clean Foley, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Foley\") & (outputs_3[\"VALUE\"]>5500)].index).copy()\n",
    "#Clean Expected Blood Loss, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"OR EBL\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean Out Expected Blood Loss, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"OR Out EBL\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean OR Urine, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"OR Urine\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean Pre-Admission, remove too large and negative values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Pre-Admission\") & (outputs_3[\"VALUE\"]<0)].index).copy()\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Pre-Admission\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean Pre-Admission output, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Pre-Admission Output Pre-Admission Output\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean Urine Out Foley output, remove too large values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Urine Out Foley\") & (outputs_3[\"VALUE\"]>5000)].index).copy()\n",
    "#Clean Void, remove negative values\n",
    "outputs_3=outputs_3.drop(outputs_3.loc[(outputs_3[\"LABEL\"]==\"Void\") & (outputs_3[\"VALUE\"]<0)].index).copy()\n",
    "\n",
    "outputs_3.dropna(subset=[\"VALUE\"],inplace=True)\n",
    "\n",
    "print(\"Number of patients remaining in the database: \")\n",
    "print(outputs_3[\"SUBJECT_ID\"].nunique())\n",
    "print(\"Number of datapoints remaining in the database: \")\n",
    "print(len(outputs_3.index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As data is already in timestamp format, we don't neeed to consider rates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs_3.to_csv(file_path+\"OUTPUTS_processed.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
